# ---- utils libs ----
import numpy as np
import pandas as pd
import datetime
from typing import Optional
# --- Import functions from utils.py ---
import sys
sys.path.insert(0,'../src')
from utils import plot_confusion_matrix, plot_activity_hist, load_dataset, load_aggregate_dataset, time_in_range, segmentDf, create_sequence, train_test_split_dataset, convertToSequenceParameters
# ---- Data Viz libs ----
from matplotlib import pyplot as plt
import plotly.graph_objects as go
import seaborn as sns
# ---- ML libs ----
from sklearn.preprocessing import StandardScaler
# ---- Deep Learning libs ----
from tensorflow import keras
from tensorflow.keras import layers
# Load house 1 / block 2 resampled at a 1-hour period, then separate the
# activity labels from the aggregated mains power signal.
dataset_resampled_1H = load_dataset("house1_power_blk2_labels.zip","60min")
df_activity_resampled_1H = dataset_resampled_1H["activity"]
# From here on, dataset_resampled_1H holds only the mains series.
dataset_resampled_1H = dataset_resampled_1H["mains"]
df_mains_resampled_1H = pd.DataFrame(dataset_resampled_1H)
df_mains_resampled_1H  # notebook-style display of the mains dataframe
Résumé du dataset en prenant un point toutes les heures
# Visualize the hourly-resampled aggregated load curve.
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=df_mains_resampled_1H.index,
        y=df_mains_resampled_1H['mains'],
        name='Load Curve',
    )
)
fig.update_layout(showlegend=True, title='house1_power_blk2_labels_60min')
fig.show()
Remarque ci-dessus : l'activité électrique diminue globalement pour le fichier "house1_power_blk2_labels", cela peut poser problème pour la séparation train/test
3D-array [samples, SEQUENCE_LENGTH, features] (i.e. sequences from the timeseries), as required for LSTM network. We want our network to have memory of 10 days, so we set SEQUENCE_LENGTH=10.
TIME_STEP = datetime.timedelta(minutes=1, seconds=30) # duration of a step in the resample dataset, originally 1 second
# ---- Sequence construction parameters ----
DURATION_TIME = datetime.timedelta(minutes=60)  # duration of one sequence
OVERLAP_PERIOD_PERCENT = 0.8                    # 0.5 <=> 50% overlapping
# Timeframes of the day considered as inactive (off-peak hours).
TIMEFRAMES = [(datetime.time(0,0,0), datetime.time(3,0,0))]
STRATEGY = "off_peak_time"  # one of: device, off_peak_time, label
# Convert wall-clock durations into sequence-index parameters.
SEQUENCE_LENGTH, OVERLAP_PERIOD = convertToSequenceParameters(
    TIME_STEP, DURATION_TIME, OVERLAP_PERIOD_PERCENT)
print(f"\t\tValeur choisie \t Equivalent sequence\nTimestep : \t {TIME_STEP}\nDuration :\t {DURATION_TIME} \t --> {SEQUENCE_LENGTH} \nOverlap :\t {OVERLAP_PERIOD_PERCENT} \t\t --> {OVERLAP_PERIOD}")
df_mains_resampled_1H.loc[:, ['mains']]  # notebook-style display
def preprocessing(timeframes: list
                  ,sequence_length: int, overlap_period: int
                  ,resample_period :Optional[str]=None
                  ,use_labels :Optional[bool]=False
                  ,strategy :Optional[str] = "off_peak_time"
                  ,split_rate :Optional[float]=0.2) -> np.array:
    """
    Build the train/test sequence datasets for the reconstruction model.

    1/ Loads the dataset and resamples the timeseries
    2/ Splits the dataframe into a train set and a test set
    3/ Standardizes the data (scaler fitted on the train set only, to
       avoid leaking test statistics)
    4/ Builds the train set according to the chosen strategy
       (peak/off-peak hours, activity labels, or inactive devices)
    5/ Creates sequences of length `sequence_length` with the given overlap

    Args:
        - timeframes: list of (start, end) datetime.time tuples considered
          as inactive, e.g. [(datetime.time(0,0,0), datetime.time(3,0,0))]
        - sequence_length: length of each sequence
        - overlap_period: overlap between consecutive sequences
        - resample_period: (optional) resample period; if None the default
          period of 1 second will be used
        - use_labels: (False by default) use the activity labels
        - strategy: one of "device", "off_peak_time" (default), "label"
        - split_rate: rate of the test set size.
          NOTE(review): currently not forwarded to train_test_split_dataset;
          confirm that helper's default split matches this value.

    Returns:
        - tuple of train/test dataframes plus preprocessed 3D arrays
          [samples, sequence_length, features]; the exact tuple layout
          depends on `strategy` (the "off_peak_time" variant additionally
          returns the list of train segments).

    Raises:
        - ValueError: if `strategy` is not a supported value (previously the
          function silently returned None).
    """
    # Load dataset with labels and resampled timeseries.
    df_resampled = load_dataset("house1_power_blk2_labels.zip", resample_period)
    # Split dataframe into train set and test set.
    train_df, test_df = train_test_split_dataset(df_resampled)
    # Standardize data: fit the scaler on the train set only.
    scaler = StandardScaler()
    scaler_train = scaler.fit(train_df.loc[:, ['mains']])
    train_df.loc[:, 'mains'] = scaler_train.transform(train_df.loc[:, ['mains']])
    test_df.loc[:, 'mains'] = scaler_train.transform(test_df.loc[:, ['mains']])
    # ---- TEST SEQUENCES (common to every strategy) ----
    X_sequences_test, y_sequences_test = create_sequence(test_df, sequence_length, overlap_period)
    print('Duplicates in test_df : ', test_df.duplicated().any())
    # BUG FIX: the branches below previously tested the module-level STRATEGY
    # constant instead of the `strategy` parameter, making the parameter
    # inoperative for any caller that passed a different value.
    if strategy == "device":
        # Reload a fresh (unscaled) labelled dataset ...
        df_resampled_with_labels = load_dataset("house1_power_blk2_labels.zip", resample_period)
        # ... and the aggregated load curve of the inactive devices.
        df_resampled_devices_inactive = load_aggregate_dataset("house1_power_blk2.zip", "inactive_house2", resample_period)
        activity = df_resampled_with_labels["activity"]
        df_resampled_device = df_resampled_devices_inactive.join(activity)
        df_resampled_device['mains'] = scaler_train.transform(df_resampled_device[['mains']])
        # --- TRAIN SEQUENCES ----
        X_sequence_train_device, y_sequence_train_device = create_sequence(df_resampled_device, sequence_length, overlap_period)
        return df_resampled_device, test_df, X_sequence_train_device, y_sequence_train_device, X_sequences_test, y_sequences_test
    if strategy == "label":
        # Keep only the timestamps labelled as "no activity".
        df_resampled_with_labels = load_dataset("house1_power_blk2_labels.zip", resample_period)
        df_resampled_with_labels = df_resampled_with_labels[df_resampled_with_labels.activity == 0]
        df_resampled_with_labels['mains'] = scaler_train.transform(df_resampled_with_labels[['mains']])
        # --- TRAIN SEQUENCES ----
        X_sequence_train_label, y_sequence_train_label = create_sequence(df_resampled_with_labels, sequence_length, overlap_period)
        return df_resampled_with_labels, test_df, X_sequence_train_label, y_sequence_train_label, X_sequences_test, y_sequences_test
    if strategy == "off_peak_time":
        # --- TRAIN SEQUENCES ----
        # Build the train segments according to peak and off-peak hours.
        list_df_train = segmentDf(train_df, timeframes = timeframes)
        # Seed the 3D array [samples, sequence_length, features] with the
        # first segment, then append the sequences of the remaining ones.
        first_df_train = list_df_train[0]
        list_X_sequence_train, list_y_sequence_train = create_sequence(first_df_train, sequence_length, overlap_period)
        list_df_train.pop(0)  # the first segment has been consumed above
        for df_train_ in list_df_train:
            X_sequences_train, y_sequences_train = create_sequence(df_train_, sequence_length, overlap_period)
            list_X_sequence_train = np.append(list_X_sequence_train, X_sequences_train, axis = 0)
            list_y_sequence_train = np.append(list_y_sequence_train, y_sequences_train, axis = 0)
        return list_df_train, train_df, test_df, list_X_sequence_train, list_y_sequence_train, X_sequences_test, y_sequences_test
    raise ValueError(f"Unknown strategy: {strategy!r}; expected 'device', 'label' or 'off_peak_time'")
# Run the preprocessing pipeline for the selected STRATEGY and unpack the
# strategy-specific return tuple.
if STRATEGY == "device":
    resampled_device_train_df, test_df, X_train, y_train, X_test, y_test = preprocessing(
        timeframes=TIMEFRAMES,
        sequence_length=SEQUENCE_LENGTH,
        overlap_period=OVERLAP_PERIOD,
        resample_period=TIME_STEP,
        strategy=STRATEGY)
if STRATEGY == "label":
    resampled_label_train_df, test_df, X_train, y_train, X_test, y_test = preprocessing(
        timeframes=TIMEFRAMES,
        sequence_length=SEQUENCE_LENGTH,
        overlap_period=OVERLAP_PERIOD,
        resample_period=TIME_STEP,
        strategy=STRATEGY)
if STRATEGY == "off_peak_time":
    print("Strategy chosen : off_peak_time")
    list_df_train, train_df, test_df, X_train, y_train, X_test, y_test = preprocessing(
        timeframes=TIMEFRAMES,
        sequence_length=SEQUENCE_LENGTH,
        overlap_period=OVERLAP_PERIOD,
        resample_period=TIME_STEP,
        strategy=STRATEGY)
# Display the shape of the train dataframe produced by the chosen strategy.
if STRATEGY == "device":
    print("---- train DataFrame Device shape ----")
    print(resampled_device_train_df.shape)
if STRATEGY == "label":
    print("---- train DataFrame Label shape ----")
    print(resampled_label_train_df.shape)
if STRATEGY == "off_peak_time":
    print("---- train DataFrame Off-Peak-Time shape ----")
    print(train_df.shape)
print("---- test DataFrame shape ----")
test_df.shape  # notebook-style display
train_df
if STRATEGY == "device":
fig = go.Figure()
fig.add_trace(go.Scatter(x=resampled_device_train_df.index, y=resampled_device_train_df['mains'], name='Load Curve'))
fig.update_layout(showlegend=True, title='resampled_device_train_df')
fig.show()
if STRATEGY == "label":
fig = go.Figure()
fig.add_trace(go.Scatter(x=resampled_label_train_df.index, y=resampled_label_train_df['mains'], name='Load Curve'))
fig.update_layout(showlegend=True, title='resampled_label_train_df')
fig.show()
if STRATEGY == "off_peak_time":
fig = go.Figure()
fig.add_trace(go.Scatter(x=train_df.index, y=train_df['mains'], name='Load Curve'))
fig.update_layout(showlegend=True, title='train_df')
fig.show()
test_df
fig = go.Figure()
# Plot the test set load curve (fig is created on the preceding line).
fig.add_trace(go.Scatter(x=test_df.index, y=test_df['mains'], name='Load Curve'))
fig.update_layout(showlegend=True, title='test_df')
fig.show()
# Overlay the train and test periods to visually check the split.
fig = go.Figure()
fig.add_trace(go.Scatter(x=train_df.index, y=train_df['mains'], name='Train'))
fig.add_trace(go.Scatter(x=test_df.index, y=test_df['mains'], name='Test'))
fig.update_layout(showlegend=True,
                  title='train test split of Dataframe',
                  xaxis_title="Date",
                  yaxis_title="mains (power, normalized)",
                  legend_title="Dataframe")
fig.show()
# Compare the mean power of both sets (should be comparable after scaling).
print("train_df[mains].mean()", train_df["mains"].mean())
print("test_df[mains].mean()", test_df["mains"].mean())
list_df_train
if STRATEGY == "off_peak_time":
fig = go.Figure()
fig.add_trace(go.Scatter(x=list_df_train[6].index, y=list_df_train[6]['mains'], name='Load Curve'))
fig.update_layout(showlegend=True, title='list_df_train'+str(2))
fig.show()
print("TIME_STEP value : ", TIME_STEP)
print("---- X_train sequence shape ----")
print(X_train.shape)
print("\n---- y_train sequence shape ----")
print(y_train.shape)
print("\n\n---- X_test sequence shape ----")
print(X_test.shape)
print("\n---- y_test sequence shape ----")
print(y_test.shape)
print("---- X_train sequence ----")
X_train
print("---- y_train sequence ----")
y_train
print("---- X_test sequence ----")
X_test
print("---- y_test sequence ----")
y_test
We will build a convolutional reconstruction autoencoder model. The model will take input of shape (batch_size, sequence_length, num_features) and return output of the same shape. In this case, sequence_length is 10 and num_features is 1.
X_train.shape  # 3d Array (samples, SEQUENCE_LENGTH, num_features)
X_train.shape[1]  # sequence length
X_train.shape[2]  # number of features
# Convolutional reconstruction autoencoder: Conv1D layers downsample the
# sequence (stride 2), Conv1DTranspose layers upsample it back so the
# output shape matches the input shape.
model = keras.Sequential(
    [
        layers.Input(shape=(X_train.shape[1], X_train.shape[2])),
        layers.Conv1D(filters=32, kernel_size=7, padding="same",
                      strides=2, activation="relu"),
        layers.Dropout(rate=0.2),
        layers.Conv1D(filters=16, kernel_size=7, padding="same",
                      strides=2, activation="relu"),
        layers.Conv1DTranspose(filters=16, kernel_size=7, padding="same",
                               strides=2, activation="relu"),
        layers.Dropout(rate=0.2),
        layers.Conv1DTranspose(filters=32, kernel_size=7, padding="same",
                               strides=2, activation="relu"),
        layers.Conv1DTranspose(filters=1, kernel_size=4, padding="same"),
    ]
)
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001), loss="mse")
model.summary()
Please note that we are using X_train as both the input and the target since this is a reconstruction model.
# Train the autoencoder; X_train is both input and target since this is a
# reconstruction model. Early stopping monitors the validation loss.
early_stop = keras.callbacks.EarlyStopping(monitor="val_loss", patience=5, mode="min")
history = model.fit(
    X_train,
    X_train,
    epochs=50,
    batch_size=128,
    validation_split=0.1,
    callbacks=[early_stop],
)
Let's plot training and validation loss to see how the training went.
# Plot training and validation loss to inspect convergence.
for key, label in (("loss", "Training Loss"), ("val_loss", "Validation Loss")):
    plt.plot(history.history[key], label=label)
plt.legend()
plt.title("Training & Validation Loss Evolution\n")
plt.show()
We will detect anomalies by determining how well our model can reconstruct the input data.
1/ Find MAE loss on training samples.
2/ Find max MAE loss value. This is the worst our model has performed trying to reconstruct a sample. We will make this the threshold for anomaly detection.
3/ If the reconstruction loss for a sample is greater than this threshold value then we can infer that the model is seeing a pattern that it isn't familiar with. We will label this sample as an anomaly.
# Get train MAE loss.
# FIX: the original ran model.predict(X_train) twice (and recomputed the
# identical MAE) -- predict once and reuse the result.
X_train_pred = model.predict(X_train)
X_train_pred.shape  # (samples, sequence_length, features)
train_mae_loss = np.mean(np.abs(X_train_pred - X_train), axis=1)
train_mae_loss.shape
plt.figure(figsize = (10, 7))
# sns.distplot is removed in seaborn >= 0.14; histplot(kde=True) is the
# documented replacement with equivalent output.
sns.histplot(train_mae_loss, bins=50, kde=True)
plt.xlabel("Train MAE loss")
plt.ylabel("No of samples")
plt.show()
# Get reconstruction loss threshold: the worst reconstruction seen on the
# training data becomes the anomaly threshold.
threshold = np.max(train_mae_loss)
print("Reconstruction error threshold: ", threshold)
Just for fun, let's see how our model has reconstructed one sample: a sequence of SEQUENCE_LENGTH timesteps from our training dataset.
# Check how one training sequence is reconstructed.
# NOTE(review): index 1 is the *second* sequence, although the surrounding
# text says "first" -- confirm which sample was intended.
plt.figure(figsize=(10, 5))
plt.plot(X_train[1], label="real load curve")
plt.plot(X_train_pred[1], label="reconstructed load curve")
plt.title("Reconstruction load curve comparison\n", fontsize=15)
plt.legend()
plt.show()
# Quick look at the raw test mains series before computing test errors.
df_test_value = test_df["mains"]
fig, ax = plt.subplots()
df_test_value.plot(legend=False, ax=ax)
plt.show()
print("Test input shape: ", X_test.shape)
# Get test MAE loss.
X_test_pred = model.predict(X_test)
test_mae_loss = np.mean(np.abs(X_test_pred - X_test), axis=1)
test_mae_loss = test_mae_loss.reshape((-1))
plt.figure(figsize = (10, 5))
# sns.distplot is removed in seaborn >= 0.14; histplot(kde=True) reproduces
# the old distplot default (histogram + KDE curve).
sns.histplot(test_mae_loss, bins=50, kde=True)
plt.xlabel("test MAE loss")
plt.ylabel("No of samples")
plt.show()
# Detect all the samples which are anomalies: reconstruction error above
# the worst error seen on the training data.
anomalies = test_mae_loss > threshold
print("Number of anomaly samples: ")
print(np.sum(anomalies))
print("\n\nIndices of anomaly samples: ")
print(np.where(anomalies))
anomalies_idx = np.where(anomalies)
# Inspect the labels of the first two anomalous sequences.
y_test[anomalies_idx[0][0]]
y_test[anomalies_idx[0][1]]
# Get the index of each sequence considered as an anomaly.
# FIX: replaced the manual append loop with `anomalies[i] == True` tests by
# an idiomatic comprehension over the boolean mask.
sequences_anomalies_idx = [i for i, is_anomaly in enumerate(anomalies) if is_anomaly]
# Get the index of each data point from X_test considered as an anomaly:
# column 2 of y_test holds the data-point index within the test dataframe.
data_anomalies_idx = [y_test[seq_idx][step][2]
                      for seq_idx in sequences_anomalies_idx
                      for step in range(SEQUENCE_LENGTH)]
print("Number of data points considered as anomalies (= activity) : ", len(data_anomalies_idx))
threshold as an anomaly
df_subset = df_test_value.iloc[data_anomalies_idx]
# Overlay the detected anomalies (predicted activity) on the test load curve.
fig = go.Figure()
fig.add_trace(go.Scatter(x=df_test_value.index,
                         y=df_test_value.values,
                         name='Normal'))
fig.add_trace(go.Scatter(x=df_subset.index,
                         y=df_subset.values,
                         mode='markers',
                         name='Anomaly = Activity (Predicted)'))
fig.update_layout(showlegend=True, title='Detected anomalies')
fig.show()
threshold of each sequence considered as an anomaly
Ici : pour chaque séquence considérée comme une anomalie, on propose de visualiser les points pour lesquels l'écart est le plus important entre le point original et le point reconstruit.
# a = vector of the timestamps flagged as anomalies.
# BUG FIX: the original computed np.abs((X_test_pred - X_test) > threshold),
# i.e. the absolute value of a boolean mask, which only kept points whose
# *signed* error exceeded the threshold. The intent is to threshold the
# absolute reconstruction error, catching deviations in both directions.
a = y_test[:, :, 0][(np.abs(X_test_pred - X_test) > threshold).squeeze()]
a = np.unique(a, return_counts=True)
df_subset = df_test_value.iloc[data_anomalies_idx]
df_subset = df_subset[df_subset.index.isin(a[0])]  # a[0] holds the values, a[1] the counts
fig = go.Figure()
fig.add_trace(go.Scatter(x=df_test_value.index, y=df_test_value.values, name='Normal'))
fig.add_trace(go.Scatter(x=df_subset.index, y=df_subset.values, mode='markers', name='Anomaly = Activity (Predicted)'))
fig.update_layout(showlegend=True, title='Detected anomalies')
fig.show()
Pas correct en l'état, à mettre à jour
# Build a prediction dataframe: every retained point is flagged as activity.
df_subset_pred = pd.DataFrame(df_subset)
df_subset_pred["activity_pred"] = 1
df_subset_pred
# Compare predicted activity periods against the ground-truth labels.
fig, ax = plt.subplots()
plot_activity_hist(df_subset_pred['activity_pred'], figsize=(12, 6), alpha=0.5,
                   label='predictions', ax=ax)
plot_activity_hist(test_df["activity"], figsize=(12, 6), alpha=0.5,
                   label='truth', color='tab:orange', ax=ax)
# Rebuild a full test dataframe carrying the predicted activity flag:
# everything defaults to 0, anomalous timestamps are set to 1.
test_df_eval = test_df.copy()
test_df_eval["activity"] = 0
idx_anom = df_subset_pred.index
test_df_eval.loc[idx_anom, "activity"] = 1
test_df_eval
sns.histplot(data=test_df_eval, x="activity").set(
    title='Activity prediction distribution (Activity VS Non Activity)')
test_df_eval.activity.value_counts()
# Confusion matrix: ground truth vs predicted activity.
plot_confusion_matrix(test_df["activity"], test_df_eval['activity'])
%%time
def post_processing(y_test, sequence_length, data_anomalies_idx, sequences_idx=None):
    """
    Post-process the model predictions with a majority-vote strategy.

    FIXES vs the original:
    - ``DataFrame.append`` (removed in pandas 2.0) replaced by building a
      list of row dicts and one DataFrame constructor call.
    - The per-timestamp rescan of the whole y_test array (O(N*M)) replaced
      by a single pass that maps each timestamp to the sequences containing
      it; the per-row debug prints were dropped.
    - The original read the module-level ``sequences_anomalies_idx`` global;
      it is now exposed as the backward-compatible ``sequences_idx`` keyword
      (the global remains the default).

    Args:
        - y_test: 3D-array of sequences of [timestamp, activity label, index]
          triples (e.g. [Timestamp('2016-04-25 08:48:00'), 0, 0])
        - sequence_length: the length of each y_test sequence
        - data_anomalies_idx: list of data-point indexes predicted as
          anomalies. NOTE(review): unused, kept for interface compatibility.
        - sequences_idx: (optional) indexes of the sequences predicted as
          anomalies; defaults to the module-level sequences_anomalies_idx.

    Returns:
        - DataFrame with one row per distinct timestamp:
            - Timestamp: datetime of the time series
            - list_idx_sequence_no_activity: sequences containing the
              timestamp for which no activity was predicted
            - list_idx_sequence_activity: sequences containing the
              timestamp for which activity was predicted
            - nb_no_activity / nb_activity: sizes of the two lists
            - total: nb_no_activity + nb_activity
            - method_prediction_1: majority vote between the two counts
        - Side effect: the DataFrame is also exported to ./data_prediction.pkl
    """
    if sequences_idx is None:
        # Backward-compatible fallback on the module-level global used by
        # the original implementation.
        sequences_idx = sequences_anomalies_idx
    anomalous = set(sequences_idx)  # O(1) membership tests
    # Single pass: map each timestamp (in first-occurrence order, like the
    # original dict.fromkeys de-duplication) to the sequences containing it.
    per_timestamp = {}
    for i in range(y_test.shape[0]):
        for k in range(sequence_length):
            per_timestamp.setdefault(y_test[i][k][0], []).append(i)
    print(len(per_timestamp))
    rows = []
    for timestamp, seq_ids in per_timestamp.items():
        idx_activity = [i for i in seq_ids if i in anomalous]
        idx_no_activity = [i for i in seq_ids if i not in anomalous]
        rows.append({'Timestamp': timestamp,
                     'list_idx_sequence_no_activity': idx_no_activity,
                     'list_idx_sequence_activity': idx_activity,
                     'nb_no_activity': len(idx_no_activity),
                     'nb_activity': len(idx_activity),
                     'total': len(seq_ids)})
    data_prediction = pd.DataFrame(rows, columns=['Timestamp',
                                                  'list_idx_sequence_no_activity',
                                                  'list_idx_sequence_activity',
                                                  'nb_no_activity',
                                                  'nb_activity',
                                                  'total'])
    ### Majority vote post-process strategy ###
    # Predict activity when more containing sequences voted "activity".
    data_prediction["method_prediction_1"] = np.where(
        data_prediction["nb_activity"] > data_prediction["nb_no_activity"], 1, 0)
    # Export prediction to .pickle format.
    data_prediction.to_pickle("./data_prediction.pkl")
    return data_prediction
# Run the post-processing and inspect the per-timestamp vote totals.
data_prediction = post_processing(y_test, SEQUENCE_LENGTH, data_anomalies_idx)
data_prediction  # notebook-style display
data_prediction.total.value_counts()